{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import BertTokenizer\n",
    "from pathlib import Path\n",
    "import torch\n",
    "\n",
    "from box import Box\n",
    "import pandas as pd\n",
    "import collections\n",
    "import os\n",
    "from tqdm import tqdm, trange\n",
    "import sys\n",
    "import random\n",
    "import numpy as np\n",
    "import apex\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "import datetime\n",
    "\n",
    "from fast_bert.modeling import BertForMultiLabelSequenceClassification\n",
    "from fast_bert.data_cls import BertDataBunch, InputExample, InputFeatures, MultiLabelTextProcessor, convert_examples_to_features\n",
    "from fast_bert.learner_cls import BertLearner\n",
    "from fast_bert.metrics import accuracy_multilabel, accuracy_thresh, fbeta, roc_auc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Release cached GPU memory that PyTorch is holding but not currently using.\n",
    "torch.cuda.empty_cache()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Never truncate long text cells when displaying DataFrames. `None` is the supported\n",
    "# 'no limit' value; the legacy `-1` sentinel has been deprecated since pandas 1.0.\n",
    "pd.set_option('display.max_colwidth', None)\n",
    "\n",
    "# Timestamp of this run; it is embedded in the log file name further down.\n",
    "run_start_time = datetime.datetime.today().strftime('%Y-%m-%d_%H-%M-%S')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# --- Input locations ---\n",
    "DATA_PATH = Path('../data/')\n",
    "LABEL_PATH = Path('../labels/')\n",
    "AUG_DATA_PATH = Path('../data/data_augmentation/')\n",
    "\n",
    "# --- Output locations (created below if missing) ---\n",
    "MODEL_PATH = Path('../models/')\n",
    "LOG_PATH = Path('../logs/')\n",
    "OUTPUT_PATH = MODEL_PATH/'output'\n",
    "\n",
    "# MODEL_PATH must come before OUTPUT_PATH: mkdir() is called without parents=True.\n",
    "for required_dir in (MODEL_PATH, LOG_PATH, OUTPUT_PATH):\n",
    "    required_dir.mkdir(exist_ok=True)\n",
    "\n",
    "# --- Pretrained / fine-tuned weights ---\n",
    "# Alternative pretrained directories, kept for reference:\n",
    "#   Path('../../bert_models/pretrained-weights/cased_L-12_H-768_A-12/')\n",
    "#   Path('../../bert_fastai/pretrained-weights/uncased_L-24_H-1024_A-16/')\n",
    "BERT_PRETRAINED_PATH = Path('../../bert_models/pretrained-weights/uncased_L-12_H-768_A-12/')\n",
    "\n",
    "# No fine-tuned checkpoint is loaded here. To load one, use instead:\n",
    "#   FINETUNED_PATH = Path('../models/finetuned_model.bin')\n",
    "#   model_state_dict = torch.load(FINETUNED_PATH)\n",
    "FINETUNED_PATH = None\n",
    "model_state_dict = None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run configuration. Every key is listed exactly once: a repeated key in a dict literal\n",
    "# silently overrides the earlier value, which hides the setting that is actually in effect.\n",
    "args = Box({\n",
    "    # Run identification and data locations\n",
    "    \"run_text\": \"multilabel toxic comments with freezable layers\",\n",
    "    \"train_size\": -1,\n",
    "    \"val_size\": -1,\n",
    "    \"log_path\": LOG_PATH,\n",
    "    \"full_data_dir\": DATA_PATH,\n",
    "    \"data_dir\": DATA_PATH,\n",
    "    \"task_name\": 'intent',\n",
    "    \"no_cuda\": False,\n",
    "    \"bert_model\": BERT_PRETRAINED_PATH,\n",
    "    \"output_dir\": OUTPUT_PATH,\n",
    "    # Tokenisation and batching\n",
    "    \"max_seq_length\": 512,\n",
    "    \"do_train\": True,\n",
    "    \"do_eval\": True,\n",
    "    \"do_lower_case\": True,\n",
    "    \"train_batch_size\": 8,\n",
    "    \"eval_batch_size\": 16,\n",
    "    # Optimisation\n",
    "    \"learning_rate\": 5e-5,\n",
    "    \"num_train_epochs\": 6,\n",
    "    \"warmup_proportion\": 0.0,\n",
    "    \"local_rank\": -1,\n",
    "    \"seed\": 42,\n",
    "    \"gradient_accumulation_steps\": 1,\n",
    "    \"optimize_on_cpu\": False,\n",
    "    \"fp16\": True,\n",
    "    \"fp16_opt_level\": \"O1\",\n",
    "    \"weight_decay\": 0.0,\n",
    "    \"adam_epsilon\": 1e-8,\n",
    "    \"max_grad_norm\": 1.0,\n",
    "    \"max_steps\": -1,\n",
    "    \"warmup_steps\": 500,\n",
    "    # Logging, checkpoints and caching\n",
    "    \"logging_steps\": 50,\n",
    "    \"eval_all_checkpoints\": True,\n",
    "    \"overwrite_output_dir\": True,\n",
    "    \"overwrite_cache\": False,\n",
    "    \"loss_scale\": 128,\n",
    "    # Model selection\n",
    "    \"model_name\": 'xlnet-base-cased',\n",
    "    \"model_type\": 'xlnet'\n",
    "})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "import logging\n",
    "\n",
    "# One log file per run, named after the run's start time and its description.\n",
    "logfile = str(LOG_PATH / f'log-{run_start_time}-{args[\"run_text\"]}.txt')\n",
    "\n",
    "# Send every record both to the log file and to stdout.\n",
    "log_handlers = [logging.FileHandler(logfile), logging.StreamHandler(sys.stdout)]\n",
    "\n",
    "logging.basicConfig(\n",
    "    level=logging.INFO,\n",
    "    format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',\n",
    "    datefmt='%m/%d/%Y %H:%M:%S',\n",
    "    handlers=log_handlers,\n",
    ")\n",
    "\n",
    "# Root logger, configured by basicConfig() above.\n",
    "logger = logging.getLogger()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "08/02/2019 09:38:23 - INFO - root -   {'run_text': 'multilabel toxic comments with freezable layers', 'train_size': -1, 'val_size': -1, 'log_path': PosixPath('../logs'), 'full_data_dir': PosixPath('../data'), 'data_dir': PosixPath('../data'), 'task_name': 'intent', 'no_cuda': False, 'bert_model': PosixPath('../../bert_models/pretrained-weights/uncased_L-12_H-768_A-12'), 'output_dir': PosixPath('../models/output'), 'max_seq_length': 512, 'do_train': True, 'do_eval': True, 'do_lower_case': True, 'train_batch_size': 8, 'eval_batch_size': 16, 'learning_rate': 5e-05, 'num_train_epochs': 6, 'warmup_proportion': 0.0, 'local_rank': -1, 'seed': 42, 'gradient_accumulation_steps': 1, 'optimize_on_cpu': False, 'fp16': True, 'fp16_opt_level': 'O1', 'weight_decay': 0.0, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'max_steps': -1, 'warmup_steps': 500, 'logging_steps': 50, 'eval_all_checkpoints': True, 'overwrite_output_dir': True, 'overwrite_cache': False, 'loss_scale': 128, 'model_name': 'xlnet-base-cased', 'model_type': 'xlnet'}\n"
     ]
    }
   ],
   "source": [
    "# Record the full run configuration in the log.\n",
    "logger.info(args)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# tokenizer = BertTokenizer.from_pretrained(BERT_PRETRAINED_PATH, do_lower_case=args['do_lower_case'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Select CUDA only when a GPU is actually available and the `no_cuda` switch is off;\n",
    "# otherwise fall back to the CPU instead of unconditionally naming a CUDA device.\n",
    "use_cuda = torch.cuda.is_available() and not args.no_cuda\n",
    "device = torch.device('cuda' if use_cuda else 'cpu')\n",
    "\n",
    "# True when more than one CUDA device is visible (device_count() is 0 without CUDA).\n",
    "args.multi_gpu = torch.cuda.device_count() > 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Target label columns, listed in the order used for the final output file.\n",
    "label_cols = [\"toxic\", \"severe_toxic\", \"obscene\", \"threat\", \"insult\", \"identity_hate\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from fast_bert.prediction import BertClassificationPredictor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "08/02/2019 09:38:23 - INFO - transformers.tokenization_utils -   Model name '../models/output/model_out' not found in model shortcut name list (xlnet-base-cased, xlnet-large-cased). Assuming '../models/output/model_out' is a path or url to a directory containing tokenizer files.\n",
      "08/02/2019 09:38:23 - INFO - transformers.tokenization_utils -   loading file ../models/output/model_out/added_tokens.json\n",
      "08/02/2019 09:38:23 - INFO - transformers.tokenization_utils -   loading file ../models/output/model_out/special_tokens_map.json\n",
      "08/02/2019 09:38:23 - INFO - transformers.tokenization_utils -   loading file ../models/output/model_out/spiece.model\n",
      "08/02/2019 09:38:23 - INFO - transformers.modeling_utils -   loading configuration file ../models/output/model_out/config.json\n",
      "08/02/2019 09:38:23 - INFO - transformers.modeling_utils -   Model config {\n",
      "  \"attn_type\": \"bi\",\n",
      "  \"bi_data\": false,\n",
      "  \"clamp_len\": -1,\n",
      "  \"d_head\": 64,\n",
      "  \"d_inner\": 3072,\n",
      "  \"d_model\": 768,\n",
      "  \"dropout\": 0.1,\n",
      "  \"end_n_top\": 5,\n",
      "  \"ff_activation\": \"gelu\",\n",
      "  \"finetuning_task\": null,\n",
      "  \"initializer_range\": 0.02,\n",
      "  \"layer_norm_eps\": 1e-12,\n",
      "  \"mem_len\": null,\n",
      "  \"n_head\": 12,\n",
      "  \"n_layer\": 12,\n",
      "  \"n_token\": 32000,\n",
      "  \"num_labels\": 6,\n",
      "  \"output_attentions\": false,\n",
      "  \"output_hidden_states\": false,\n",
      "  \"reuse_len\": null,\n",
      "  \"same_length\": false,\n",
      "  \"start_n_top\": 5,\n",
      "  \"summary_activation\": \"tanh\",\n",
      "  \"summary_last_dropout\": 0.1,\n",
      "  \"summary_type\": \"last\",\n",
      "  \"summary_use_proj\": true,\n",
      "  \"torchscript\": false,\n",
      "  \"untie_r\": true\n",
      "}\n",
      "\n",
      "08/02/2019 09:38:23 - INFO - transformers.modeling_utils -   loading weights file ../models/output/model_out/pytorch_model.bin\n"
     ]
    }
   ],
   "source": [
    "# Build the inference wrapper around the model saved under <output_dir>/model_out.\n",
    "# `model_type` is read from the run config rather than repeating the literal 'xlnet'.\n",
    "# do_lower_case stays False (presumably because `xlnet-base-cased` is a cased checkpoint).\n",
    "# NOTE(review): the three positional arguments are passed through unchanged; confirm their\n",
    "# meaning against the installed fast_bert version of BertClassificationPredictor.\n",
    "predictor = BertClassificationPredictor(\n",
    "    args.output_dir/'model_out',\n",
    "    args.output_dir,\n",
    "    LABEL_PATH,\n",
    "    multi_label=True,\n",
    "    model_type=args.model_type,\n",
    "    do_lower_case=False,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "08/02/2019 09:38:37 - INFO - root -   Writing example 0 of 153164\n",
      "08/02/2019 09:38:43 - INFO - root -   Writing example 10000 of 153164\n",
      "08/02/2019 09:38:50 - INFO - root -   Writing example 20000 of 153164\n",
      "08/02/2019 09:38:57 - INFO - root -   Writing example 30000 of 153164\n",
      "08/02/2019 09:39:04 - INFO - root -   Writing example 40000 of 153164\n",
      "08/02/2019 09:39:11 - INFO - root -   Writing example 50000 of 153164\n",
      "08/02/2019 09:39:18 - INFO - root -   Writing example 60000 of 153164\n",
      "08/02/2019 09:39:26 - INFO - root -   Writing example 70000 of 153164\n",
      "08/02/2019 09:39:33 - INFO - root -   Writing example 80000 of 153164\n",
      "08/02/2019 09:39:39 - INFO - root -   Writing example 90000 of 153164\n",
      "08/02/2019 09:39:47 - INFO - root -   Writing example 100000 of 153164\n",
      "08/02/2019 09:39:54 - INFO - root -   Writing example 110000 of 153164\n",
      "08/02/2019 09:40:01 - INFO - root -   Writing example 120000 of 153164\n",
      "08/02/2019 09:40:08 - INFO - root -   Writing example 130000 of 153164\n",
      "08/02/2019 09:40:16 - INFO - root -   Writing example 140000 of 153164\n",
      "08/02/2019 09:40:23 - INFO - root -   Writing example 150000 of 153164\n"
     ]
    }
   ],
   "source": [
    "# Run batch inference over the `comment_text` column of the test set.\n",
    "test_comments = pd.read_csv(\"../data/test.csv\")['comment_text'].tolist()\n",
    "output = predictor.predict_batch(test_comments)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the raw predictor output. Note: a later cell writes the final `output_df`\n",
    "# to this same path, replacing this file.\n",
    "pd.DataFrame(output).to_csv('../data/output_xlnet.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read back the CSV at the path written by the previous cell.\n",
    "# NOTE(review): `results` is not referenced by any later cell visible here - confirm it is still needed.\n",
    "results = pd.read_csv('../data/output_xlnet.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _pairs_to_record(pairs):\n",
    "    \"\"\"Turn one prediction (a sequence of (label, score) items) into a {label: score} dict.\"\"\"\n",
    "    return {pair[0]: pair[1] for pair in pairs}\n",
    "\n",
    "\n",
    "# One record per input text; pandas aligns the records into one column per label.\n",
    "preds = pd.DataFrame(list(map(_pairs_to_record, output)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>identity_hate</th>\n",
       "      <th>insult</th>\n",
       "      <th>obscene</th>\n",
       "      <th>severe_toxic</th>\n",
       "      <th>threat</th>\n",
       "      <th>toxic</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.787239</td>\n",
       "      <td>0.970210</td>\n",
       "      <td>0.990423</td>\n",
       "      <td>0.316317</td>\n",
       "      <td>0.015324</td>\n",
       "      <td>0.996634</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.000071</td>\n",
       "      <td>0.000166</td>\n",
       "      <td>0.000170</td>\n",
       "      <td>0.000064</td>\n",
       "      <td>0.000041</td>\n",
       "      <td>0.000707</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.000073</td>\n",
       "      <td>0.000178</td>\n",
       "      <td>0.000183</td>\n",
       "      <td>0.000070</td>\n",
       "      <td>0.000045</td>\n",
       "      <td>0.000600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.000073</td>\n",
       "      <td>0.000179</td>\n",
       "      <td>0.000185</td>\n",
       "      <td>0.000071</td>\n",
       "      <td>0.000045</td>\n",
       "      <td>0.000594</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.000073</td>\n",
       "      <td>0.000175</td>\n",
       "      <td>0.000180</td>\n",
       "      <td>0.000068</td>\n",
       "      <td>0.000044</td>\n",
       "      <td>0.000619</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   identity_hate    insult   obscene  severe_toxic    threat     toxic\n",
       "0  0.787239       0.970210  0.990423  0.316317      0.015324  0.996634\n",
       "1  0.000071       0.000166  0.000170  0.000064      0.000041  0.000707\n",
       "2  0.000073       0.000178  0.000183  0.000070      0.000045  0.000600\n",
       "3  0.000073       0.000179  0.000185  0.000071      0.000045  0.000594\n",
       "4  0.000073       0.000175  0.000180  0.000068      0.000044  0.000619"
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "preds.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>comment_text</th>\n",
       "      <th>toxic</th>\n",
       "      <th>severe_toxic</th>\n",
       "      <th>obscene</th>\n",
       "      <th>threat</th>\n",
       "      <th>insult</th>\n",
       "      <th>identity_hate</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0000997932d777bf</td>\n",
       "      <td>Explanation\\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>000103f0d9cfb60f</td>\n",
       "      <td>D'aww! He matches this background colour I'm seemingly stuck with. Thanks.  (talk) 21:51, January 11, 2016 (UTC)</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>000113f07ec002fd</td>\n",
       "      <td>Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info.</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0001b41b1c6bb37e</td>\n",
       "      <td>\"\\nMore\\nI can't make any real suggestions on improvement - I wondered if the section statistics should be later on, or a subsection of \"\"types of accidents\"\"  -I think the references may need tidying so that they are all in the exact same format ie date format etc. I can do that later on, if no-one else does first - if you have any preferences for formatting style on references or want to do it yourself please let me know.\\n\\nThere appears to be a backlog on articles for review so I guess there may be a delay until a reviewer turns up. It's listed in the relevant form eg Wikipedia:Good_article_nominations#Transport  \"</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0001d958c54c6e35</td>\n",
       "      <td>You, sir, are my hero. Any chance you remember what page that's on?</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                 id  \\\n",
       "0  0000997932d777bf   \n",
       "1  000103f0d9cfb60f   \n",
       "2  000113f07ec002fd   \n",
       "3  0001b41b1c6bb37e   \n",
       "4  0001d958c54c6e35   \n",
       "\n",
       "                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         comment_text  \\\n",
       "0  Explanation\\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27                                                                                                                                                                                                                                                                                                                                                                            \n",
       "1  D'aww! He matches this background colour I'm seemingly stuck with. Thanks.  (talk) 21:51, January 11, 2016 (UTC)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     \n",
       "2  Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info.                                                                                                                                                                                                                                                                                                                                                                                                            \n",
       "3  \"\\nMore\\nI can't make any real suggestions on improvement - I wondered if the section statistics should be later on, or a subsection of \"\"types of accidents\"\"  -I think the references may need tidying so that they are all in the exact same format ie date format etc. I can do that later on, if no-one else does first - if you have any preferences for formatting style on references or want to do it yourself please let me know.\\n\\nThere appears to be a backlog on articles for review so I guess there may be a delay until a reviewer turns up. It's listed in the relevant form eg Wikipedia:Good_article_nominations#Transport  \"   \n",
       "4  You, sir, are my hero. Any chance you remember what page that's on?                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  \n",
       "\n",
       "   toxic  severe_toxic  obscene  threat  insult  identity_hate  \n",
       "0  0      0             0        0       0       0              \n",
       "1  0      0             0        0       0       0              \n",
       "2  0      0             0        0       0       0              \n",
       "3  0      0             0        0       0       0              \n",
       "4  0      0             0        0       0       0              "
      ]
     },
     "execution_count": 61,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the same file the predictions were generated from (test.csv) so that rows line up\n",
    "# positionally with `preds`. Reading train.csv here would pair the predictions with\n",
    "# unrelated ids and bring in label columns that collide with those in `preds`.\n",
    "test_df = pd.read_csv(\"../data/test.csv\")\n",
    "test_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Attach each row's predicted scores by positional index, then drop the raw comment text.\n",
    "merged_df = test_df.merge(preds, how='left', left_index=True, right_index=True)\n",
    "output_df = merged_df.drop(columns=['comment_text'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Output column order: the row id followed by the label columns. Derived from `label_cols`\n",
    "# so the label list is maintained in a single place.\n",
    "columns = ['id'] + label_cols"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the columns named in `columns`, in that order.\n",
    "output_df = output_df[columns]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the final id + per-label score table without the DataFrame index.\n",
    "# `index=False` is the documented way to omit it (`None` only worked by being falsy).\n",
    "output_df.to_csv('../data/output_xlnet.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>toxic</th>\n",
       "      <th>severe_toxic</th>\n",
       "      <th>obscene</th>\n",
       "      <th>threat</th>\n",
       "      <th>insult</th>\n",
       "      <th>identity_hate</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>00001cee341fdb12</th>\n",
       "      <td>0.996634</td>\n",
       "      <td>0.316317</td>\n",
       "      <td>0.990423</td>\n",
       "      <td>0.015324</td>\n",
       "      <td>0.970210</td>\n",
       "      <td>0.787239</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0000247867823ef7</th>\n",
       "      <td>0.000707</td>\n",
       "      <td>0.000064</td>\n",
       "      <td>0.000170</td>\n",
       "      <td>0.000041</td>\n",
       "      <td>0.000166</td>\n",
       "      <td>0.000071</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>00013b17ad220c46</th>\n",
       "      <td>0.000600</td>\n",
       "      <td>0.000070</td>\n",
       "      <td>0.000183</td>\n",
       "      <td>0.000045</td>\n",
       "      <td>0.000178</td>\n",
       "      <td>0.000073</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>00017563c3f7919a</th>\n",
       "      <td>0.000594</td>\n",
       "      <td>0.000071</td>\n",
       "      <td>0.000185</td>\n",
       "      <td>0.000045</td>\n",
       "      <td>0.000179</td>\n",
       "      <td>0.000073</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>00017695ad8997eb</th>\n",
       "      <td>0.000619</td>\n",
       "      <td>0.000068</td>\n",
       "      <td>0.000180</td>\n",
       "      <td>0.000044</td>\n",
       "      <td>0.000175</td>\n",
       "      <td>0.000073</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0001ea8717f6de06</th>\n",
       "      <td>0.000590</td>\n",
       "      <td>0.000071</td>\n",
       "      <td>0.000186</td>\n",
       "      <td>0.000046</td>\n",
       "      <td>0.000180</td>\n",
       "      <td>0.000074</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>00024115d4cbde0f</th>\n",
       "      <td>0.000593</td>\n",
       "      <td>0.000071</td>\n",
       "      <td>0.000185</td>\n",
       "      <td>0.000045</td>\n",
       "      <td>0.000179</td>\n",
       "      <td>0.000074</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>000247e83dcc1211</th>\n",
       "      <td>0.226302</td>\n",
       "      <td>0.000131</td>\n",
       "      <td>0.002582</td>\n",
       "      <td>0.000855</td>\n",
       "      <td>0.003960</td>\n",
       "      <td>0.001135</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>00025358d4737918</th>\n",
       "      <td>0.000625</td>\n",
       "      <td>0.000068</td>\n",
       "      <td>0.000179</td>\n",
       "      <td>0.000044</td>\n",
       "      <td>0.000174</td>\n",
       "      <td>0.000073</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>00026d1092fe71cc</th>\n",
       "      <td>0.000596</td>\n",
       "      <td>0.000070</td>\n",
       "      <td>0.000184</td>\n",
       "      <td>0.000045</td>\n",
       "      <td>0.000178</td>\n",
       "      <td>0.000073</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0002eadc3b301559</th>\n",
       "      <td>0.015858</td>\n",
       "      <td>0.000048</td>\n",
       "      <td>0.001624</td>\n",
       "      <td>0.000550</td>\n",
       "      <td>0.001616</td>\n",
       "      <td>0.000448</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0002f87b16116a7f</th>\n",
       "      <td>0.148970</td>\n",
       "      <td>0.000126</td>\n",
       "      <td>0.005184</td>\n",
       "      <td>0.000728</td>\n",
       "      <td>0.004418</td>\n",
       "      <td>0.001590</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0003806b11932181</th>\n",
       "      <td>0.000607</td>\n",
       "      <td>0.000069</td>\n",
       "      <td>0.000182</td>\n",
       "      <td>0.000045</td>\n",
       "      <td>0.000176</td>\n",
       "      <td>0.000073</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0003e1cccfd5a40a</th>\n",
       "      <td>0.000594</td>\n",
       "      <td>0.000070</td>\n",
       "      <td>0.000185</td>\n",
       "      <td>0.000045</td>\n",
       "      <td>0.000179</td>\n",
       "      <td>0.000073</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>00059ace3e3e9a53</th>\n",
       "      <td>0.000598</td>\n",
       "      <td>0.000070</td>\n",
       "      <td>0.000184</td>\n",
       "      <td>0.000045</td>\n",
       "      <td>0.000178</td>\n",
       "      <td>0.000073</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>000634272d0d44eb</th>\n",
       "      <td>0.000616</td>\n",
       "      <td>0.000069</td>\n",
       "      <td>0.000180</td>\n",
       "      <td>0.000045</td>\n",
       "      <td>0.000175</td>\n",
       "      <td>0.000073</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>000663aff0fffc80</th>\n",
       "      <td>0.000610</td>\n",
       "      <td>0.000069</td>\n",
       "      <td>0.000181</td>\n",
       "      <td>0.000045</td>\n",
       "      <td>0.000176</td>\n",
       "      <td>0.000073</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>000689dd34e20979</th>\n",
       "      <td>0.003059</td>\n",
       "      <td>0.000032</td>\n",
       "      <td>0.000230</td>\n",
       "      <td>0.000061</td>\n",
       "      <td>0.000248</td>\n",
       "      <td>0.000095</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>000834769115370c</th>\n",
       "      <td>0.000593</td>\n",
       "      <td>0.000071</td>\n",
       "      <td>0.000185</td>\n",
       "      <td>0.000045</td>\n",
       "      <td>0.000179</td>\n",
       "      <td>0.000074</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>000844b52dee5f3f</th>\n",
       "      <td>0.000603</td>\n",
       "      <td>0.000069</td>\n",
       "      <td>0.000182</td>\n",
       "      <td>0.000045</td>\n",
       "      <td>0.000177</td>\n",
       "      <td>0.000073</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>00084da5d4ead7aa</th>\n",
       "      <td>0.000613</td>\n",
       "      <td>0.000069</td>\n",
       "      <td>0.000181</td>\n",
       "      <td>0.000045</td>\n",
       "      <td>0.000176</td>\n",
       "      <td>0.000073</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>00091c35fa9d0465</th>\n",
       "      <td>0.981178</td>\n",
       "      <td>0.011368</td>\n",
       "      <td>0.006646</td>\n",
       "      <td>0.095204</td>\n",
       "      <td>0.083853</td>\n",
       "      <td>0.034600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>000968ce11f5ee34</th>\n",
       "      <td>0.011687</td>\n",
       "      <td>0.000039</td>\n",
       "      <td>0.001122</td>\n",
       "      <td>0.000330</td>\n",
       "      <td>0.001211</td>\n",
       "      <td>0.000327</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0009734200a85047</th>\n",
       "      <td>0.000600</td>\n",
       "      <td>0.000070</td>\n",
       "      <td>0.000183</td>\n",
       "      <td>0.000045</td>\n",
       "      <td>0.000178</td>\n",
       "      <td>0.000073</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>00097b6214686db5</th>\n",
       "      <td>0.056855</td>\n",
       "      <td>0.000158</td>\n",
       "      <td>0.003972</td>\n",
       "      <td>0.001650</td>\n",
       "      <td>0.003072</td>\n",
       "      <td>0.001480</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0009aef4bd9e1697</th>\n",
       "      <td>0.000596</td>\n",
       "      <td>0.000070</td>\n",
       "      <td>0.000184</td>\n",
       "      <td>0.000045</td>\n",
       "      <td>0.000178</td>\n",
       "      <td>0.000073</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>000a02d807ae0254</th>\n",
       "      <td>0.000602</td>\n",
       "      <td>0.000070</td>\n",
       "      <td>0.000183</td>\n",
       "      <td>0.000045</td>\n",
       "      <td>0.000177</td>\n",
       "      <td>0.000073</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>000a6c6d4e89b9bc</th>\n",
       "      <td>0.000612</td>\n",
       "      <td>0.000069</td>\n",
       "      <td>0.000181</td>\n",
       "      <td>0.000045</td>\n",
       "      <td>0.000176</td>\n",
       "      <td>0.000073</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>000bafe2080bba82</th>\n",
       "      <td>0.007298</td>\n",
       "      <td>0.000033</td>\n",
       "      <td>0.000589</td>\n",
       "      <td>0.000166</td>\n",
       "      <td>0.000668</td>\n",
       "      <td>0.000218</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>000bf0a9894b2807</th>\n",
       "      <td>0.000595</td>\n",
       "      <td>0.000070</td>\n",
       "      <td>0.000185</td>\n",
       "      <td>0.000045</td>\n",
       "      <td>0.000179</td>\n",
       "      <td>0.000073</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>fff3ae2e177b6bb3</th>\n",
       "      <td>0.000597</td>\n",
       "      <td>0.000070</td>\n",
       "      <td>0.000184</td>\n",
       "      <td>0.000045</td>\n",
       "      <td>0.000178</td>\n",
       "      <td>0.000073</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>fff4109e837f7acc</th>\n",
       "      <td>0.000684</td>\n",
       "      <td>0.000065</td>\n",
       "      <td>0.000172</td>\n",
       "      <td>0.000042</td>\n",
       "      <td>0.000168</td>\n",
       "      <td>0.000072</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>fff4373a81ef9f2a</th>\n",
       "      <td>0.000593</td>\n",
       "      <td>0.000071</td>\n",
       "      <td>0.000185</td>\n",
       "      <td>0.000045</td>\n",
       "      <td>0.000179</td>\n",
       "      <td>0.000074</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>fff460574ddbcd80</th>\n",
       "      <td>0.000607</td>\n",
       "      <td>0.000069</td>\n",
       "      <td>0.000182</td>\n",
       "      <td>0.000045</td>\n",
       "      <td>0.000176</td>\n",
       "      <td>0.000073</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>fff4fc0a1555be5c</th>\n",
       "      <td>0.000609</td>\n",
       "      <td>0.000069</td>\n",
       "      <td>0.000181</td>\n",
       "      <td>0.000045</td>\n",
       "      <td>0.000176</td>\n",
       "      <td>0.000073</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>fff5b9bb944d634c</th>\n",
       "      <td>0.000602</td>\n",
       "      <td>0.000070</td>\n",
       "      <td>0.000183</td>\n",
       "      <td>0.000045</td>\n",
       "      <td>0.000177</td>\n",
       "      <td>0.000073</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>fff5c4a77fe0c05f</th>\n",
       "      <td>0.000599</td>\n",
       "      <td>0.000070</td>\n",
       "      <td>0.000183</td>\n",
       "      <td>0.000045</td>\n",
       "      <td>0.000178</td>\n",
       "      <td>0.000073</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>fff5fb61bd637c82</th>\n",
       "      <td>0.000592</td>\n",
       "      <td>0.000071</td>\n",
       "      <td>0.000185</td>\n",
       "      <td>0.000045</td>\n",
       "      <td>0.000179</td>\n",
       "      <td>0.000074</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>fff69311f306df44</th>\n",
       "      <td>0.000604</td>\n",
       "      <td>0.000069</td>\n",
       "      <td>0.000182</td>\n",
       "      <td>0.000045</td>\n",
       "      <td>0.000177</td>\n",
       "      <td>0.000073</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>fff6ad63666fb304</th>\n",
       "      <td>0.996322</td>\n",
       "      <td>0.142446</td>\n",
       "      <td>0.985340</td>\n",
       "      <td>0.002262</td>\n",
       "      <td>0.224141</td>\n",
       "      <td>0.001168</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>fff7159b3ee95618</th>\n",
       "      <td>0.000597</td>\n",
       "      <td>0.000070</td>\n",
       "      <td>0.000184</td>\n",
       "      <td>0.000045</td>\n",
       "      <td>0.000178</td>\n",
       "      <td>0.000073</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>fff718ffe5f05559</th>\n",
       "      <td>0.000592</td>\n",
       "      <td>0.000071</td>\n",
       "      <td>0.000186</td>\n",
       "      <td>0.000045</td>\n",
       "      <td>0.000179</td>\n",
       "      <td>0.000074</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>fff7fc22a0cdccd3</th>\n",
       "      <td>0.000593</td>\n",
       "      <td>0.000071</td>\n",
       "      <td>0.000185</td>\n",
       "      <td>0.000045</td>\n",
       "      <td>0.000179</td>\n",
       "      <td>0.000074</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>fff83b80284d8440</th>\n",
       "      <td>0.000613</td>\n",
       "      <td>0.000069</td>\n",
       "      <td>0.000181</td>\n",
       "      <td>0.000045</td>\n",
       "      <td>0.000176</td>\n",
       "      <td>0.000073</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>fff8ef316d0c6990</th>\n",
       "      <td>0.001414</td>\n",
       "      <td>0.000041</td>\n",
       "      <td>0.000168</td>\n",
       "      <td>0.000038</td>\n",
       "      <td>0.000151</td>\n",
       "      <td>0.000069</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>fff8f521a7dbcd47</th>\n",
       "      <td>0.006429</td>\n",
       "      <td>0.000032</td>\n",
       "      <td>0.000505</td>\n",
       "      <td>0.000143</td>\n",
       "      <td>0.000575</td>\n",
       "      <td>0.000195</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>fff8f64043129fa2</th>\n",
       "      <td>0.000594</td>\n",
       "      <td>0.000071</td>\n",
       "      <td>0.000185</td>\n",
       "      <td>0.000045</td>\n",
       "      <td>0.000179</td>\n",
       "      <td>0.000073</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>fff9d70fe0722906</th>\n",
       "      <td>0.877522</td>\n",
       "      <td>0.000802</td>\n",
       "      <td>0.016894</td>\n",
       "      <td>0.001512</td>\n",
       "      <td>0.010563</td>\n",
       "      <td>0.000885</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>fff9fa508f400ee6</th>\n",
       "      <td>0.005560</td>\n",
       "      <td>0.000031</td>\n",
       "      <td>0.000425</td>\n",
       "      <td>0.000121</td>\n",
       "      <td>0.000486</td>\n",
       "      <td>0.000169</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>fffa3fae1890b40a</th>\n",
       "      <td>0.808441</td>\n",
       "      <td>0.001391</td>\n",
       "      <td>0.740775</td>\n",
       "      <td>0.000417</td>\n",
       "      <td>0.082590</td>\n",
       "      <td>0.000362</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>fffa8a11c4378854</th>\n",
       "      <td>0.960904</td>\n",
       "      <td>0.002972</td>\n",
       "      <td>0.002086</td>\n",
       "      <td>0.012426</td>\n",
       "      <td>0.013130</td>\n",
       "      <td>0.009593</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>fffac2a094c8e0e2</th>\n",
       "      <td>0.996867</td>\n",
       "      <td>0.222765</td>\n",
       "      <td>0.987346</td>\n",
       "      <td>0.006298</td>\n",
       "      <td>0.959652</td>\n",
       "      <td>0.386350</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>fffb5451268fb5ba</th>\n",
       "      <td>0.000603</td>\n",
       "      <td>0.000069</td>\n",
       "      <td>0.000182</td>\n",
       "      <td>0.000045</td>\n",
       "      <td>0.000177</td>\n",
       "      <td>0.000073</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>fffc2b34bbe61c8d</th>\n",
       "      <td>0.000595</td>\n",
       "      <td>0.000070</td>\n",
       "      <td>0.000185</td>\n",
       "      <td>0.000045</td>\n",
       "      <td>0.000179</td>\n",
       "      <td>0.000073</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>fffc489742ffe69b</th>\n",
       "      <td>0.976917</td>\n",
       "      <td>0.004844</td>\n",
       "      <td>0.231968</td>\n",
       "      <td>0.000729</td>\n",
       "      <td>0.956102</td>\n",
       "      <td>0.003972</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>fffcd0960ee309b5</th>\n",
       "      <td>0.110031</td>\n",
       "      <td>0.000138</td>\n",
       "      <td>0.003250</td>\n",
       "      <td>0.001035</td>\n",
       "      <td>0.003886</td>\n",
       "      <td>0.001557</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>fffd7a9a6eb32c16</th>\n",
       "      <td>0.000628</td>\n",
       "      <td>0.000068</td>\n",
       "      <td>0.000178</td>\n",
       "      <td>0.000044</td>\n",
       "      <td>0.000173</td>\n",
       "      <td>0.000073</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>fffda9e8d6fafa9e</th>\n",
       "      <td>0.000597</td>\n",
       "      <td>0.000070</td>\n",
       "      <td>0.000184</td>\n",
       "      <td>0.000045</td>\n",
       "      <td>0.000178</td>\n",
       "      <td>0.000073</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>fffe8f1340a79fc2</th>\n",
       "      <td>0.000595</td>\n",
       "      <td>0.000070</td>\n",
       "      <td>0.000184</td>\n",
       "      <td>0.000045</td>\n",
       "      <td>0.000179</td>\n",
       "      <td>0.000073</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ffffce3fb183ee80</th>\n",
       "      <td>0.982582</td>\n",
       "      <td>0.015749</td>\n",
       "      <td>0.958704</td>\n",
       "      <td>0.000485</td>\n",
       "      <td>0.527072</td>\n",
       "      <td>0.000640</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>153164 rows × 6 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                     toxic  severe_toxic   obscene    threat    insult  \\\n",
       "id                                                                       \n",
       "00001cee341fdb12  0.996634  0.316317      0.990423  0.015324  0.970210   \n",
       "0000247867823ef7  0.000707  0.000064      0.000170  0.000041  0.000166   \n",
       "00013b17ad220c46  0.000600  0.000070      0.000183  0.000045  0.000178   \n",
       "00017563c3f7919a  0.000594  0.000071      0.000185  0.000045  0.000179   \n",
       "00017695ad8997eb  0.000619  0.000068      0.000180  0.000044  0.000175   \n",
       "0001ea8717f6de06  0.000590  0.000071      0.000186  0.000046  0.000180   \n",
       "00024115d4cbde0f  0.000593  0.000071      0.000185  0.000045  0.000179   \n",
       "000247e83dcc1211  0.226302  0.000131      0.002582  0.000855  0.003960   \n",
       "00025358d4737918  0.000625  0.000068      0.000179  0.000044  0.000174   \n",
       "00026d1092fe71cc  0.000596  0.000070      0.000184  0.000045  0.000178   \n",
       "0002eadc3b301559  0.015858  0.000048      0.001624  0.000550  0.001616   \n",
       "0002f87b16116a7f  0.148970  0.000126      0.005184  0.000728  0.004418   \n",
       "0003806b11932181  0.000607  0.000069      0.000182  0.000045  0.000176   \n",
       "0003e1cccfd5a40a  0.000594  0.000070      0.000185  0.000045  0.000179   \n",
       "00059ace3e3e9a53  0.000598  0.000070      0.000184  0.000045  0.000178   \n",
       "000634272d0d44eb  0.000616  0.000069      0.000180  0.000045  0.000175   \n",
       "000663aff0fffc80  0.000610  0.000069      0.000181  0.000045  0.000176   \n",
       "000689dd34e20979  0.003059  0.000032      0.000230  0.000061  0.000248   \n",
       "000834769115370c  0.000593  0.000071      0.000185  0.000045  0.000179   \n",
       "000844b52dee5f3f  0.000603  0.000069      0.000182  0.000045  0.000177   \n",
       "00084da5d4ead7aa  0.000613  0.000069      0.000181  0.000045  0.000176   \n",
       "00091c35fa9d0465  0.981178  0.011368      0.006646  0.095204  0.083853   \n",
       "000968ce11f5ee34  0.011687  0.000039      0.001122  0.000330  0.001211   \n",
       "0009734200a85047  0.000600  0.000070      0.000183  0.000045  0.000178   \n",
       "00097b6214686db5  0.056855  0.000158      0.003972  0.001650  0.003072   \n",
       "0009aef4bd9e1697  0.000596  0.000070      0.000184  0.000045  0.000178   \n",
       "000a02d807ae0254  0.000602  0.000070      0.000183  0.000045  0.000177   \n",
       "000a6c6d4e89b9bc  0.000612  0.000069      0.000181  0.000045  0.000176   \n",
       "000bafe2080bba82  0.007298  0.000033      0.000589  0.000166  0.000668   \n",
       "000bf0a9894b2807  0.000595  0.000070      0.000185  0.000045  0.000179   \n",
       "...                    ...       ...           ...       ...       ...   \n",
       "fff3ae2e177b6bb3  0.000597  0.000070      0.000184  0.000045  0.000178   \n",
       "fff4109e837f7acc  0.000684  0.000065      0.000172  0.000042  0.000168   \n",
       "fff4373a81ef9f2a  0.000593  0.000071      0.000185  0.000045  0.000179   \n",
       "fff460574ddbcd80  0.000607  0.000069      0.000182  0.000045  0.000176   \n",
       "fff4fc0a1555be5c  0.000609  0.000069      0.000181  0.000045  0.000176   \n",
       "fff5b9bb944d634c  0.000602  0.000070      0.000183  0.000045  0.000177   \n",
       "fff5c4a77fe0c05f  0.000599  0.000070      0.000183  0.000045  0.000178   \n",
       "fff5fb61bd637c82  0.000592  0.000071      0.000185  0.000045  0.000179   \n",
       "fff69311f306df44  0.000604  0.000069      0.000182  0.000045  0.000177   \n",
       "fff6ad63666fb304  0.996322  0.142446      0.985340  0.002262  0.224141   \n",
       "fff7159b3ee95618  0.000597  0.000070      0.000184  0.000045  0.000178   \n",
       "fff718ffe5f05559  0.000592  0.000071      0.000186  0.000045  0.000179   \n",
       "fff7fc22a0cdccd3  0.000593  0.000071      0.000185  0.000045  0.000179   \n",
       "fff83b80284d8440  0.000613  0.000069      0.000181  0.000045  0.000176   \n",
       "fff8ef316d0c6990  0.001414  0.000041      0.000168  0.000038  0.000151   \n",
       "fff8f521a7dbcd47  0.006429  0.000032      0.000505  0.000143  0.000575   \n",
       "fff8f64043129fa2  0.000594  0.000071      0.000185  0.000045  0.000179   \n",
       "fff9d70fe0722906  0.877522  0.000802      0.016894  0.001512  0.010563   \n",
       "fff9fa508f400ee6  0.005560  0.000031      0.000425  0.000121  0.000486   \n",
       "fffa3fae1890b40a  0.808441  0.001391      0.740775  0.000417  0.082590   \n",
       "fffa8a11c4378854  0.960904  0.002972      0.002086  0.012426  0.013130   \n",
       "fffac2a094c8e0e2  0.996867  0.222765      0.987346  0.006298  0.959652   \n",
       "fffb5451268fb5ba  0.000603  0.000069      0.000182  0.000045  0.000177   \n",
       "fffc2b34bbe61c8d  0.000595  0.000070      0.000185  0.000045  0.000179   \n",
       "fffc489742ffe69b  0.976917  0.004844      0.231968  0.000729  0.956102   \n",
       "fffcd0960ee309b5  0.110031  0.000138      0.003250  0.001035  0.003886   \n",
       "fffd7a9a6eb32c16  0.000628  0.000068      0.000178  0.000044  0.000173   \n",
       "fffda9e8d6fafa9e  0.000597  0.000070      0.000184  0.000045  0.000178   \n",
       "fffe8f1340a79fc2  0.000595  0.000070      0.000184  0.000045  0.000179   \n",
       "ffffce3fb183ee80  0.982582  0.015749      0.958704  0.000485  0.527072   \n",
       "\n",
       "                  identity_hate  \n",
       "id                               \n",
       "00001cee341fdb12  0.787239       \n",
       "0000247867823ef7  0.000071       \n",
       "00013b17ad220c46  0.000073       \n",
       "00017563c3f7919a  0.000073       \n",
       "00017695ad8997eb  0.000073       \n",
       "0001ea8717f6de06  0.000074       \n",
       "00024115d4cbde0f  0.000074       \n",
       "000247e83dcc1211  0.001135       \n",
       "00025358d4737918  0.000073       \n",
       "00026d1092fe71cc  0.000073       \n",
       "0002eadc3b301559  0.000448       \n",
       "0002f87b16116a7f  0.001590       \n",
       "0003806b11932181  0.000073       \n",
       "0003e1cccfd5a40a  0.000073       \n",
       "00059ace3e3e9a53  0.000073       \n",
       "000634272d0d44eb  0.000073       \n",
       "000663aff0fffc80  0.000073       \n",
       "000689dd34e20979  0.000095       \n",
       "000834769115370c  0.000074       \n",
       "000844b52dee5f3f  0.000073       \n",
       "00084da5d4ead7aa  0.000073       \n",
       "00091c35fa9d0465  0.034600       \n",
       "000968ce11f5ee34  0.000327       \n",
       "0009734200a85047  0.000073       \n",
       "00097b6214686db5  0.001480       \n",
       "0009aef4bd9e1697  0.000073       \n",
       "000a02d807ae0254  0.000073       \n",
       "000a6c6d4e89b9bc  0.000073       \n",
       "000bafe2080bba82  0.000218       \n",
       "000bf0a9894b2807  0.000073       \n",
       "...                    ...       \n",
       "fff3ae2e177b6bb3  0.000073       \n",
       "fff4109e837f7acc  0.000072       \n",
       "fff4373a81ef9f2a  0.000074       \n",
       "fff460574ddbcd80  0.000073       \n",
       "fff4fc0a1555be5c  0.000073       \n",
       "fff5b9bb944d634c  0.000073       \n",
       "fff5c4a77fe0c05f  0.000073       \n",
       "fff5fb61bd637c82  0.000074       \n",
       "fff69311f306df44  0.000073       \n",
       "fff6ad63666fb304  0.001168       \n",
       "fff7159b3ee95618  0.000073       \n",
       "fff718ffe5f05559  0.000074       \n",
       "fff7fc22a0cdccd3  0.000074       \n",
       "fff83b80284d8440  0.000073       \n",
       "fff8ef316d0c6990  0.000069       \n",
       "fff8f521a7dbcd47  0.000195       \n",
       "fff8f64043129fa2  0.000073       \n",
       "fff9d70fe0722906  0.000885       \n",
       "fff9fa508f400ee6  0.000169       \n",
       "fffa3fae1890b40a  0.000362       \n",
       "fffa8a11c4378854  0.009593       \n",
       "fffac2a094c8e0e2  0.386350       \n",
       "fffb5451268fb5ba  0.000073       \n",
       "fffc2b34bbe61c8d  0.000073       \n",
       "fffc489742ffe69b  0.003972       \n",
       "fffcd0960ee309b5  0.001557       \n",
       "fffd7a9a6eb32c16  0.000073       \n",
       "fffda9e8d6fafa9e  0.000073       \n",
       "fffe8f1340a79fc2  0.000073       \n",
       "ffffce3fb183ee80  0.000640       \n",
       "\n",
       "[153164 rows x 6 columns]"
      ]
     },
     "execution_count": 59,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.read_csv('../data/output_xlnet.csv', index_col='id')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "conda_python3",
   "language": "python",
   "name": "conda_python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
